package com.customization.api.pdf2word;


import com.baidu.aip.ocr.AipOcr;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.pdmodel.*;
import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
import org.apache.pdfbox.text.PDFTextStripper;

import java.awt.image.BufferedImage;
import java.io.*;
import java.text.SimpleDateFormat;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import com.customization.commons.Console;
import org.json.JSONObject;

import javax.imageio.ImageIO;


/**
 * Demo that extracts header fields from the text layer of a PDF document.
 *
 * <p>The PDF text is read with PDFBox, then a fixed set of regular expressions
 * looks for the issue date, incoming document number, copy number, issuing
 * organisation, title and carbon-copy list. Every result (or a "NO MATCH"
 * line) is printed to standard output. An optional pass that sends embedded
 * images to the Baidu OCR service is present but disabled.
 *
 * @author liutaihong
 * @version 1.0.0
 * @createTime 2020-05-27 14:25:00
 */
public class DemoApplication {
    // Baidu OCR credentials: APP_ID / API key / secret key.
    // NOTE(review): these secrets are committed in source; consider loading them
    // from configuration and rotating the keys.
    public static final String APP_ID = "20088153";
    public static final String API_KEY = "j4m5lEFtYdaatOstrF9SzyLd";
    public static final String SECRET_KEY = "MM88YvkTxFo1qPdaz4b0SnbS38Q43aK6";
    public static final String DATE_FORMAT = "yyyy-MM-dd HH:mm:ss";

    /** PDF parsed by {@link #main(String[])} when no path is given on the command line. */
    private static final String DEFAULT_PDF_PATH = "/Users/liutaihong/Downloads/收文登记/收文登记01.pdf";

    /** Switch for the Baidu OCR pass over embedded images; disabled by default. */
    private static final boolean OCR_ENABLED = false;

    // Java regex syntax has no /.../ delimiters, so the patterns below are written bare.

    /** Date with an optional "20" century prefix, e.g. 20年5月27日, followed by whitespace. */
    private static final Pattern SHORT_DATE =
            Pattern.compile("(20)?(\\d{2})年(\\d+?)月(\\d+?)(日|号)\\s");

    /** Document number: starts with 鄂, contains a four-digit run, ends with 号; whitespace on both sides. */
    private static final Pattern DOC_NUMBER = Pattern.compile("\\s鄂(.*)(\\d{4})(.*)号\\s");

    /** Copy number: the first run of six digits, optionally preceded by one whitespace character. */
    private static final Pattern COPY_NUMBER = Pattern.compile("\\s?\\d{6}");

    /** Last occurrence of "湖北省...<whitespace>" in the text (negative lookahead rejects earlier ones). */
    private static final Pattern ISSUER =
            Pattern.compile("湖北省(.+?)\\s(?![\\s\\S]*?湖北省(.+?)\\s)");

    /** Shortest span that follows a 号 and ends with 通知, 意见 or 报告. */
    private static final Pattern TITLE = Pattern.compile("(?<=号)[\\s\\S]*?(通知|意见|报告)");

    /**
     * Four digits, exactly one arbitrary character, 年, then everything up to a 日.
     * NOTE(review): {@code [\s\S]*} is greedy, so the match extends to the LAST 日 in the
     * text; confirm whether a reluctant {@code *?} was intended.
     */
    private static final Pattern LONG_DATE = Pattern.compile("\\d{4}[\\s\\S]年[\\s\\S]*日");

    /** Text between 抄送 and the last "湖北省..." occurrence. */
    private static final Pattern CC_LIST = Pattern.compile(
            "(?<=抄送)[\\s\\S]*?(?=(湖北省(.+?)\\s(?![\\s\\S]*?湖北省(.+?)\\s)))");

    /** One or more whitespace characters as defined by {@code \s}. */
    private static final Pattern WHITESPACE = Pattern.compile("\\s+");

    /**
     * Parses a PDF file and prints the extracted header fields.
     *
     * <p>The full extracted text is passed to {@code Console.log} and printed to
     * standard output before the individual fields are printed.
     *
     * @param pdfPath path of the PDF file to read
     * @throws Exception if the file cannot be opened, loaded or read
     */
    public static void pdfParse(String pdfPath) throws Exception {
        InputStream input = new FileInputStream(new File(pdfPath));
        try {
            PDDocument document = PDDocument.load(input);
            try {
                String content = new PDFTextStripper().getText(document);
                Console.log(content);
                System.out.println(content);

                printHeaderFields(content);

                if (OCR_ENABLED) {
                    ocrEmbeddedImages(document);
                }
            } finally {
                // Close the document first; the outer finally still closes the stream
                // even when this close fails.
                document.close();
            }
        } finally {
            input.close();
        }
    }

    /** Runs every field pattern against {@code content} and prints the result or a NO MATCH line. */
    private static void printHeaderFields(String content) {
        String shortDate = firstMatch(SHORT_DATE, content);
        if (shortDate != null) {
            System.out.println("印发日期: " + shortDate);
        } else {
            System.out.println("NO MATCH");
        }

        String docNumber = firstMatch(DOC_NUMBER, content);
        if (docNumber != null) {
            // Presumably compensates for the extracted text showing the 〔 〕 brackets
            // as "(" and "J" - verify against sample documents. Every "J" is replaced.
            System.out.println("来文文号: " + getStringNoBlank(docNumber.trim())
                    .replace("(", "〔")
                    .replace("J", "〕"));
        } else {
            System.out.println("来文文号 NO MATCH");
        }

        String copyNumber = firstMatch(COPY_NUMBER, content);
        if (copyNumber != null) {
            System.out.println("文件份号: " + copyNumber.trim());
        } else {
            System.out.println("文件份号 NO MATCH");
        }

        String issuer = firstMatch(ISSUER, content);
        if (issuer != null) {
            System.out.println("来文单位: " + issuer.trim());
        } else {
            System.out.println("来文单位 NO MATCH");
        }

        String title = firstMatch(TITLE, content);
        if (title != null) {
            System.out.println("标题: " + getStringNoBlank(title));
        } else {
            System.out.println("标题 NO MATCH");
        }

        String longDate = firstMatch(LONG_DATE, content);
        if (longDate != null) {
            System.out.println("印发日期: " + getStringNoBlank(longDate));
        } else {
            System.out.println("印发日期 NO MATCH");
        }

        String ccList = firstMatch(CC_LIST, content);
        if (ccList != null) {
            System.out.println("抄送: " + getStringNoBlank(ccList));
        } else {
            System.out.println("抄送 NO MATCH");
        }
    }

    /** Returns the text of the first match of {@code pattern} in {@code text}, or {@code null} when there is none. */
    private static String firstMatch(Pattern pattern, String text) {
        Matcher matcher = pattern.matcher(text);
        return matcher.find() ? matcher.group() : null;
    }

    /**
     * Sends every image XObject found in the page resources to Baidu OCR and prints the JSON reply.
     * A failure on one image is reported to standard error and the remaining images are still processed.
     */
    private static void ocrEmbeddedImages(PDDocument document) {
        PDDocumentCatalog catalog = document.getDocumentCatalog();
        PDPageTree pages = catalog.getPages();
        System.out.println("页面:" + pages.getCount());

        AipOcr client = new AipOcr(APP_ID, API_KEY, SECRET_KEY);
        // Optional network settings: 2 s to connect, 60 s socket timeout.
        client.setConnectionTimeoutInMillis(2000);
        client.setSocketTimeoutInMillis(60000);

        int recognized = 0;
        for (int i = 0; i < pages.getCount(); i++) {
            PDPage page = pages.get(i);
            if (page == null) {
                continue;
            }
            PDResources resources = page.getResources();
            if (resources == null) {
                continue;
            }
            Iterable<COSName> names = resources.getXObjectNames();
            if (names == null) {
                continue;
            }
            for (COSName key : names) {
                if (!resources.isImageXObject(key)) {
                    continue;
                }
                try {
                    PDImageXObject image = (PDImageXObject) resources.getXObject(key);
                    BufferedImage rendered = image.getImage();

                    // Encode the image as PNG bytes for the OCR request.
                    ByteArrayOutputStream buffer = new ByteArrayOutputStream();
                    ImageIO.write(rendered, "png", buffer);

                    JSONObject json = client.basicGeneral(buffer.toByteArray(), new HashMap<String, String>());
                    System.out.println(json.toString(2));
                    recognized++;
                    System.out.println(recognized);
                } catch (Exception e) {
                    // Best effort per image: report the failure and continue with the next one.
                    System.err.println("OCR failed for image " + key + " on page " + (i + 1) + ": " + e);
                }
            }
        }
    }

    /**
     * Formats a calendar value using {@link #DATE_FORMAT}.
     *
     * @param calendar the time to format; may be {@code null}
     * @return the formatted time, or {@code null} when {@code calendar} is {@code null}
     * @throws Exception declared for compatibility with existing callers
     */
    public static String dateFormat(Calendar calendar) throws Exception {
        if (calendar == null) {
            return null;
        }
        // A new formatter per call avoids sharing SimpleDateFormat, which is not thread-safe.
        return new SimpleDateFormat(DATE_FORMAT).format(calendar.getTime());
    }

    /**
     * Parses the PDF named by the first command-line argument, or a built-in
     * default path when no argument is given.
     *
     * @param args optional; {@code args[0]} is the path of the PDF to parse
     * @throws Exception if parsing fails
     */
    public static void main(String[] args) throws Exception {
        String path = (args != null && args.length > 0) ? args[0] : DEFAULT_PDF_PATH;
        pdfParse(path);
    }

    /**
     * Removes every whitespace character (as matched by the regex class {@code \s}) from a string.
     *
     * @param str the input; may be {@code null} or empty
     * @return {@code str} without whitespace; {@code null} and empty input are returned unchanged
     */
    public static String getStringNoBlank(String str) {
        if (str == null || str.length() == 0) {
            return str;
        }
        return WHITESPACE.matcher(str).replaceAll("");
    }

}
